#Import modules
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
# Helpful
import os
# Linear Regression Class
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import max_error
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import Ridge, Lasso
from sklearn.linear_model import SGDRegressor
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
import seaborn as sns
import missingno as msno
import warnings
warnings.filterwarnings("ignore")
# Metrics - R2
from sklearn.metrics import r2_score
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
pd.set_option('display.max_rows', 127)
import azureml.core
from azureml.core import Workspace
# Load the workspace from the saved config file
ws = Workspace.from_config()
print('Ready to use Azure ML {} to work with {}'.format(azureml.core.VERSION, ws.name))
from azureml.core import Experiment
# Create an Azure ML experiment in your workspace
experiment = Experiment(workspace=ws, name="LoanStats-workspace")
# Start logging data from the experiment, obtaining a reference to the experiment run
run = experiment.start_logging()
print("Starting experiment:", experiment.name)
print('\n')
# Load the data from a local file
loans = pd.read_csv("LoanStats.csv")
# Count the rows and columns, then log both to the experiment run
row_count, column_count = loans.shape
run.log('row count', row_count)
print('Analyzing {} rows of data'.format(row_count))
run.log('column count', column_count)
print('Analyzing {} columns of data'.format(column_count))
print('\n')
# Read the first 5 rows
print('First 5 rows in the dataset:')
print(loans.head())
print('\n')
# Check Datatypes
print('Data type of each column:')
# NOTE(review): run.log() is normally given a scalar/string; a dtypes Series
# will be stringified by the SDK — confirm it renders usefully in the portal.
run.log('data types of each column', loans.dtypes)
print(loans.dtypes)
# Log summary statistics for numeric columns
num_columns = ['int_rate', 'loan_amnt', 'annual_inc', 'term']
summary_stats = loans[num_columns].describe().to_dict()
for col, stats in summary_stats.items():
    for stat_name, stat_value in stats.items():
        run.log_row(col, stat=stat_name, value=stat_value)
# Check the Descriptive Statistics
loans.describe()
# Check the Descriptive Statistics for String Data
loans.describe(exclude='number')
When the categorical features are analyzed, the borrower's grade clearly has a direct impact on the interest rate. The purpose and home_ownership features do not show a clear relation to the interest rate. To judge the impact of loan_amnt, annual_inc, or term, a correlation matrix must be visualized; even so, the graphs do not suggest a strong relationship.
Frequency distributions demonstrate that int_rate, loan_amnt, and annual_inc do not follow a normal distribution. Instead, the distributions are right-skewed, which suggests that a normalization procedure should be applied in the data-preparation stage.
# Visualize the correlations in the dataset by grade
grid = sns.pairplot(loans, hue='grade', diag_kind='kde')
run.log_image(name='Pairplot by grade', plot=grid)
# Check frequency distributions of the numerical features
for feature in ['int_rate', 'loan_amnt', 'annual_inc', 'term']:
    sns.histplot(data=loans, x=feature, kde=True)
    plt.show()
# Visualize the impact of grade over interest rate
sns.boxplot(x="grade", y="int_rate", data=loans)
# Visualize the impact of term over interest rate
sns.boxplot(x="term", y="int_rate", data=loans)
# Create the correlation matrix; mask out the redundant upper triangle
corr = loans.corr()
mask = np.triu(np.ones_like(corr, dtype=bool))
# Set the color palette
cmap = sns.diverging_palette(h_neg=10, h_pos=240, as_cmap=True)
# Create the heatmap with the color palette
sns.heatmap(corr, mask=mask, center=0, cmap=cmap,
            linewidths=1, annot=True, fmt=".2f")
plt.show()
Findings: there are no missing values in any column or row; the count for every column is 39786.
# Visualize Missing Values
msno.matrix(loans)
# detect the columns with more than 55% null values
null_values = loans.loc[:, loans.isnull().mean() > .55]
print('Number of columns BEFORE removal: ' + str(loans.shape[1]))
# create more than 55% null value columns list and remove them from the dataset
NA_columns = list(null_values.columns)
print('There are {} columns to be removed'.format(len(NA_columns)))
# remove the NA columns
loans.drop(labels=NA_columns, axis=1, inplace=True)
print('Number of columns AFTER removal: ' + str(loans.shape[1]))
# Identify rows where more than 90% of the values are missing.
# FIX: the original collected the offending rows into a DataFrame and removed
# them with `loans[~loans.isin(to_drop)].dropna()`. That masks the matching
# cells to NaN and then drops EVERY row containing any NaN at all — far more
# rows than the stated >90%-missing criterion. Dropping by index removes
# exactly the intended rows.
row_na_fraction = loans.isnull().mean(axis=1)
rows_to_drop = loans.index[row_na_fraction > .9]
print(' The number of rows where missing values are more than 90%: ' + str(len(rows_to_drop)))
loans = loans.drop(index=rows_to_drop)
print(' Remaining number of rows AFTER removal: ' + str(len(loans)))
def OutlierDetection(df):
    """Replace outliers with NaN, in place, for every non-object column.

    A value counts as an outlier when it lies more than three standard
    deviations away from its column mean. Object (string) columns are
    left untouched.
    """
    for name in df.columns:
        series = df[name]
        if series.dtype == 'object':
            continue
        mu = series.mean()
        sigma = series.std()
        beyond_3_sigma = (series > mu + 3 * sigma) | (series < mu - 3 * sigma)
        df[name] = series.mask(beyond_3_sigma, np.nan)
# FIX: the NaN counts were bare expressions, which display nothing when run
# as a script — wrap them in print() so the BEFORE/AFTER headers show data.
print('Number of NaNs BEFORE removing the outliers:')
print(loans.isnull().sum())
OutlierDetection(loans)
print('Number of NaNs AFTER removing the outliers:')
print(loans.isnull().sum())
# define a custom function for MICE IMPUTATION
def MiceImputation(df):
    """Return a copy of *df* whose non-object columns have their NaNs
    imputed by MICE (scikit-learn's IterativeImputer); object columns
    are passed through unchanged and re-attached on the right.

    Fixes vs. the original:
    * ``np.object`` was deprecated and removed (NumPy >= 1.24); the
      builtin ``object`` is the supported spelling.
    * the imputed frame now keeps ``df``'s index, so the join with the
      categorical columns stays aligned even after rows were dropped
      upstream (a fresh RangeIndex would silently misalign and create
      spurious NaNs).
    """
    mice_imputer = IterativeImputer(max_iter=10, random_state=0)
    numeric_only = df.loc[:, df.dtypes != object]
    categorical_only = df.loc[:, df.dtypes == object]
    imputed = mice_imputer.fit_transform(numeric_only)
    imputed_df = pd.DataFrame(imputed,
                              columns=numeric_only.columns.to_list(),
                              index=df.index)
    return imputed_df.join(categorical_only)
# Apply the imputation function on the numeric columns of loans dataset
loans = MiceImputation(loans)
print('Number of NaNs AFTER MICE imputation:')
# FIX: print the counts — a bare expression displays nothing in a script.
print(loans.isnull().sum())
# Separate the target (interest rate) from the features
y = loans['int_rate']
X = loans.drop('int_rate', axis=1)
# Classify the Features by dtype: object columns are categorical
categorical_features = X.columns[X.dtypes == 'object'].tolist()
numeric_features = X.columns[X.dtypes != 'object'].tolist()
# define the transformers separately
numeric_transformer = MinMaxScaler()
categorical_transformer = OneHotEncoder(handle_unknown="ignore")
# assign the transformers to classified features
preprocessor = ColumnTransformer(transformers=[
    ("num", numeric_transformer, numeric_features),
    ("cat", categorical_transformer, categorical_features),
])
# Hold out 30% of the data for evaluation
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, shuffle=True, random_state=5)
# Append a plain linear regression model to the preprocessing pipeline
lr = Pipeline(steps=[
    ("preprocessor", preprocessor),
    ("model", LinearRegression()),
])
# fit the model to the train set
lr.fit(X_train, y_train)
# make predictions on the test set and train set
y_pred_test = lr.predict(X_test)
y_pred_train = lr.predict(X_train)
# Compute and print R^2 and RMSE for both splits
print("R^2 for Train Set: {}".format(lr.score(X_train, y_train)))
print("R^2 for Test Set: {}".format(lr.score(X_test, y_test)))
RMSE_train = np.sqrt(mean_squared_error(y_train, y_pred_train))
RMSE_test = np.sqrt(mean_squared_error(y_test, y_pred_test))
print('\n')
print("Root Mean Squared Error for Train Set: {}".format(RMSE_train))
print("Root Mean Squared Error for Test Set: {}".format(RMSE_test))
# extract one-hot encoding applied features
# FIX: OneHotEncoder.get_feature_names was deprecated in scikit-learn 1.0 and
# removed in 1.2; get_feature_names_out is the supported replacement.
encoder = lr.named_steps["preprocessor"].named_transformers_['cat']
onehot_columns = list(encoder.get_feature_names_out(categorical_features))
# Transformed column order is numeric first, then one-hot — matching the
# ColumnTransformer's output, so it lines up with the model's coefficients.
numeric_features_list = list(numeric_features)
numeric_features_list.extend(onehot_columns)
# Zip the coefficients with the transformed column names
feature_importance = pd.DataFrame(
    zip(lr.named_steps["model"].coef_, numeric_features_list),
    columns=['coefficient', 'name'])
# FIX: keep the sorted result (the original discarded sort_values' return)
feature_importance = feature_importance.sort_values('coefficient', ascending=True)
import plotly.express as px
# visualize the coefficients
fig = px.bar(feature_importance, x='coefficient', y='name')
fig.show()
# Cross Validation with LASSO
lasso = Lasso(random_state=42)
las_pip = Pipeline(
    steps=[("preprocessor", preprocessor), ("model", lasso)])
# Cross Validation with RIDGE
ridge = Ridge(random_state=5)
rid_pip = Pipeline(
    steps=[("preprocessor", preprocessor), ("model", ridge)])
pipelines = [las_pip, rid_pip]
param_grid = {'model__alpha': [0.00001, 0.0001, 0.001, 0.01, 0.1, 1]}
# Grid-search each regularized pipeline over alpha and report test-set RMSE
for pipeline in pipelines:
    gcv = GridSearchCV(pipeline, param_grid, scoring='r2', cv=5)
    gcv.fit(X_train, y_train)
    y_pred = gcv.predict(X_test)
    RMSE = np.sqrt(mean_squared_error(y_test, y_pred))
    print('Used Model: ' + str(pipeline[1]))
    print('\n')
    print("Best parameters found: ", gcv.best_params_)
    # FIX: with scoring='r2', best_score_ already IS the R^2 — the original
    # printed np.sqrt(np.abs(best_score_)), which is not an R^2 at all.
    print("Best R^2 found: {:.3f}".format(gcv.best_score_))
    print("Root Mean Square Error: {:.3f}".format(RMSE))
    print('\n')
# GridSearchCV with SGDRegressor
sgd = SGDRegressor(random_state=100)
sgd_pip = Pipeline(
    steps=[("preprocessor", preprocessor), ("model", sgd)])
param_grid = {
    'model__alpha': [0.00001, 0.0001, 0.001, 0.01, 0.1, 1],
    'model__eta0': [0.00001, 0.0001, 0.001, 0.01, 0.1, 1],
    'model__max_iter': [10, 100, 1000],
    # FIX: the 'squared_loss' alias was renamed to 'squared_error' in
    # scikit-learn 1.0 and removed in 1.2.
    'model__loss': ['squared_error'],
    'model__penalty': ['l1', 'l2', 'elasticnet'],
    'model__learning_rate': ['adaptive', 'optimal'],
}
gcv = GridSearchCV(sgd_pip, param_grid, scoring='r2', cv=5)
gcv.fit(X_train, y_train)
y_pred = gcv.predict(X_test)
RMSE = np.sqrt(mean_squared_error(y_test, y_pred))
print('-- Used Model: SGDRegressor() --')
print('\n')
print("Best parameters found: ", gcv.best_params_)
# FIX: best_score_ already IS the R^2 under scoring='r2'; do not sqrt(abs()).
print("Best R^2 found: {:.3f}".format(gcv.best_score_))
print("Root Mean Squared Error: {:.3f}".format(RMSE))
print('\n')
# FIX: keep the sorted leaderboard (the original discarded sort_values' return)
ModelsFinal = pd.DataFrame(gcv.cv_results_)
ModelsFinal = ModelsFinal.sort_values('rank_test_score', ascending=True)
Among the linear regressors trained here, the best R^2 result appears to be achieved when LASSO regression is deployed with the following hyperparameters:
This is further confirmed through running gridsearchCV with more hyperparameters via SGDRegressor.
Using the above-specified hyperparameters, the best R-squared score is 0.96, while the lowest root mean squared error is 1.048. Since our aim is to get the most accurate predictions for the interest rate column, the R-squared score carries more weight in our analysis.
The most decisive factors for interest rate are: grade, term, and loan amount.
Note: Given the stochastic nature of the algorithms, the results are prone to change on different runs.